In [1]:
%matplotlib inline
import gzip
import json
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict, Counter
In [2]:
%%time
data = []
media_types = defaultdict(int)
url_types = defaultdict(int)
unique_urls = set()
with gzip.open("all_ids.txt.json.gz") as fp:
    for line in fp:
        d = json.loads(line.strip())
        data.append(d)
        if 'entities' not in d:
            continue
        if 'media' in d['entities']:
            # Count attached media (photos, videos, ...) by type.
            for m in d['entities']['media']:
                media_types[m['type']] += 1
        if 'urls' in d['entities']:
            # Count embedded URLs, keyed by the domain of the expanded URL.
            for u in d['entities']['urls']:
                media_types['url'] += 1
                url = u['expanded_url']
                # "http://domain/path".split("/", 3) -> ["http:", "", "domain", "path"]
                domain = url.split("/", 3)[2]
                unique_urls.add((url, domain))
                url_types[domain] += 1
print(media_types)
url_types = Counter(url_types)
print(len(url_types), len(unique_urls))
In [3]:
url_types.most_common(50)
Out[3]:
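matplotlib and seaborn are imported above but not used yet; the same ranking is easier to eyeball as a horizontal bar chart. A minimal sketch, reusing the url_types Counter from the cells above:

top = url_types.most_common(20)
domains, counts = zip(*top)
plt.figure(figsize=(8, 6))
sns.barplot(x=list(counts), y=list(domains), orient='h')  # one bar per domain
plt.xlabel("tweets linking to domain")
plt.tight_layout()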
In [4]:
sorted(unique_urls,
       key=lambda x: url_types[x[1]],
       reverse=True)[:10]
Out[4]:
with open("all_urls.txt", "wb+") as fp:
for url in sorted(filter(lambda x: x[1] != 'twitter.com',
unique_urls),
key=lambda x: url_types[x[1]],
reverse=True):
print >> fp, "%s\t%s\t%s" % (url[0], url[1], url_types[url[1]])
! head all_urls.txt
# python download_expanded.py --jobs 20 --batches 200 # Run this to expand URLs
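download_expanded.py itself is not shown in this notebook. A minimal single-threaded sketch of what it does, assuming it reads all_urls.txt and resolves each shortened URL by following HTTP redirects (the requests library and the output format are assumptions, not the actual script; the --jobs/--batches flags suggest the real script parallelizes over batches, which this sketch omits):

# download_expanded.py -- hypothetical sketch, not the actual script
import requests

def expand(url, timeout=10):
    # HEAD follows redirects without downloading the response body.
    try:
        return requests.head(url, allow_redirects=True, timeout=timeout).url
    except requests.RequestException:
        return url  # fall back to the original URL on failure

if __name__ == "__main__":
    with open("all_urls.txt") as inp, open("exp_urls.txt", "w") as out:
        for line in inp:
            url = line.split("\t", 1)[0]
            out.write("%s\t%s\n" % (url, expand(url)))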
In [6]:
! head exp_urls.txt
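Assuming exp_urls.txt maps each original URL to its resolved target as TSV (the exact output format of download_expanded.py is not shown here), the expansions can be loaded back into a dict for later joins:

expanded = {}
with open("exp_urls.txt") as fp:
    for line in fp:
        parts = line.rstrip("\n").split("\t")
        if len(parts) >= 2:
            expanded[parts[0]] = parts[1]
len(expanded)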
In [7]:
data[0].keys()
Out[7]:
In [8]:
data[0][u'source']
Out[8]:
In [9]:
data[0][u'is_quote_status']
Out[9]:
In [10]:
data[0][u'quoted_status']['text']
Out[10]:
In [11]:
data[0]['text']
Out[11]:
In [12]:
count_quoted = 0
has_coordinates = 0
count_replies = 0
language_ids = defaultdict(int)
count_user_locs = 0
user_locs = Counter()
count_verified = 0
# One pass over all tweets: tally quote/reply/geo/verified counts and languages.
for d in data:
    count_quoted += d.get('is_quote_status', 0)
    coords = d.get('coordinates', None)
    repl_id = d.get('in_reply_to_status_id', None)
    has_coordinates += (coords is not None)
    count_replies += (repl_id is not None)
    loc = d['user'].get('location', '')
    count_verified += d['user']['verified']
    if loc != '':
        count_user_locs += 1
        user_locs.update([loc])
    language_ids[d['lang']] += 1
print(count_quoted, has_coordinates, count_replies, count_user_locs, count_verified)
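pandas is imported at the top but unused so far; as a cross-check, the same counts can be computed by flattening the fields accessed above into a DataFrame (a sketch, not a replacement for the loop):

df = pd.DataFrame({
    'quoted': [bool(d.get('is_quote_status', False)) for d in data],
    'has_coords': [d.get('coordinates') is not None for d in data],
    'is_reply': [d.get('in_reply_to_status_id') is not None for d in data],
    'verified': [d['user']['verified'] for d in data],
    'lang': [d['lang'] for d in data],
})
print(df[['quoted', 'has_coords', 'is_reply', 'verified']].sum())
print(df['lang'].value_counts().head())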
In [13]:
count_verified
Out[13]:
In [14]:
user_locs.most_common(10)
Out[14]:
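The location field is free-form user text, so the raw counts split variants like "New York, NY" and "new york, ny". A light normalization before counting (a sketch, not applied in the cells above):

norm_locs = Counter()
for d in data:
    loc = d['user'].get('location', '')
    if loc:
        # collapse runs of whitespace and case-fold before counting
        norm_locs[" ".join(loc.split()).lower()] += 1
norm_locs.most_common(10)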
In [15]:
len(data)
Out[15]:
In [16]:
data[0]['user']
Out[16]:
In [ ]: